[1.] Explorar dataset

1.0 Configuración inicial

Para replicar las secciones de esta clase, primero debe descargar el siguiente proyecto de R y abrir el archivo clase-13.Rproj.

Instalar/llamar las librerías de la clase

#### **Install/load the packages used in this class**
## Bootstrap pacman first: require() returns FALSE (instead of erroring) when
## the package is missing, which would make the p_load() call below fail with
## a confusing "could not find function" error. Install it on first use.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
library(pacman)
p_load(tidyverse, rio, glue,
       hexbin,                       ## plot: stat_summary_hex
       patchwork, vip,               ## plot: compose plots / variable importance
       ggrepel,                      ## plot: geom_text_repel
       stringi, tidytext, stopwords, ## text-data
       tidymodels, finetune)

1.1 Leer dataset

## Read the raw data, normalize the free-text description, and rebuild the
## outcome (price) from the quantile label price_q.
db <- import("input/house_prices.rds") %>%
      select(-price) %>%
      ## lower-case the text and strip accents / special characters
      mutate(description = str_to_lower(description),
             description = stri_trans_general(str = description, id = "Latin-ASCII")) %>%
      ## parse_number() keeps the first number in price_q and drops any
      ## non-numeric characters before or after it
      mutate(price = parse_number(gsub(",", "-", price_q))) %>%
      select(-price_q)

## Frequency table of the outcome: the 20 most common price levels
db %>%
  count(price) %>%
  arrange(desc(n)) %>%
  head(20)
##     price    n
## 1   966.0 1609
## 2   330.0 1600
## 3    17.8 1561
## 4  2000.0 1543
## 5   635.0 1541
## 6  1580.0 1529
## 7   783.0 1523
## 8  3000.0 1517
## 9  1250.0 1455
## 10  480.0 1432
## Spatial view of prices: hex-binned summary of price over lon/lat
ggplot(db, aes(lon, lat, z = price)) +
  stat_summary_hex(alpha = 0.8, bins = 50) +
  scale_fill_viridis_c(option = "A") +
  theme_light()

## Flatten the tibble to one token (word) per row, then drop Spanish stopwords.
db_tidy <- db %>%
  unnest_tokens(output = word, input = description) %>%
  anti_join(get_stopwords("es"), by = "word")

db_tidy %>% count(word, sort = TRUE) %>% head(20)
##            word     n
## 1   apartamento 10990
## 2          zona 10330
## 3          sala  8125
## 4        cocina  7685
## 5          bano  7161
## 6           dos  6807
## 7        cuenta  6759
## 8       comedor  6737
## 9             2  6222
## 10     edificio  6140
## 11          bao  6134
## 12         piso  5485
## 13    excelente  5270
## 14      terraza  5163
## 15       social  5125
## 16            3  4832
## 17      alcobas  4743
## 18        vista  4615
## 19 parqueaderos  4433
## 20 habitaciones  4087
## Top-100 words by absolute frequency, excluding bare digit tokens "0".."10".
## slice_max() keeps ties, so the result may hold slightly more than 100 words.
top_words <- db_tidy %>%
  count(word, sort = TRUE) %>%
  filter(!word %in% as.character(0:10)) %>%
  slice_max(n, n = 100) %>%
  pull(word)

top_words
##   [1] "apartamento"    "zona"           "sala"           "cocina"        
##   [5] "bano"           "dos"            "cuenta"         "comedor"       
##   [9] "edificio"       "bao"            "piso"           "excelente"     
##  [13] "terraza"        "social"         "alcobas"        "vista"         
##  [17] "parqueaderos"   "habitaciones"   "cuarto"         "servicio"      
##  [21] "venta"          "estudio"        "ubicado"        "acceso"        
##  [25] "tres"           "cerca"          "chimenea"       "pisos"         
##  [29] "principal"      "deposito"       "exterior"       "privado"       
##  [33] "oficina"        "independiente"  "garajes"        "espacios"      
##  [37] "gimnasio"       "banos"          "area"           "parqueadero"   
##  [41] "ascensor"       "salon"          "visitantes"     "sector"        
##  [45] "comunal"        "abierta"        "parque"         "chico"         
##  [49] "natural"        "amplia"         "calle"          "independientes"
##  [53] "lavanderia"     "ciudad"         "espectacular"   "closet"        
##  [57] "alcoba"         "madera"         "baos"           "zonas"         
##  [61] "integral"       "m2"             "amplio"         "vende"         
##  [65] "cada"           "mas"            "bogota"         "acabados"      
##  [69] "vigilancia"     "mts"            "24"             "vias"          
##  [73] "gas"            "ubicacion"      "seguridad"      "iluminacion"   
##  [77] "bbq"            "planta"         "amplios"        "comercial"     
##  [81] "balcon"         "carrera"        "remodelado"     "ropas"         
##  [85] "gran"           "privada"        "oficinas"       "iluminado"     
##  [89] "saln"           "nivel"          "chapinero"      "exclusivo"     
##  [93] "rentahouse"     "excelentes"     "hermoso"        "oportunidad"   
##  [97] "comerciales"    "hall"           "horas"          "norte"         
## [101] "rosales"
subset(db,id=="DgJXrP7JZuQS7trOVPj4xg==")$description
## [1] "espacio - soluciones inmobiliarias ofrece en venta este espectacular apartamneto en el exclusivo sector de el refugio.el apartamento ubicado en el piso 12 del edificio construido hace un ao cuenta con una excelente vista y por lo tanto una espectacular iluminacin natural.el inmueble cuenta con un rea de 85m2 distribuidos asi: zona social con balcn y chimenea cocina abierta tipo americano con zona de ropas baos social habitacin principal con wlak-in closet y bao privado y la habitacin secudaria con bao privado.el apartamento cuenta con puerta de sueguridad ventaneratermoacstica dos parqueaderos y un depsito.el edificio tiene lobby gimnasio totalmente dotado vigilancia privada 24/7 y terraza con parque infantil y  con nosotros para brindarte ms informacin.am (197)"
## Count word-price combinations, expanding to every possible pair so that
## missing combinations appear with n = 0.
count_words <- db_tidy %>%
  count(word, price) %>%
  complete(word, price, fill = list(n = 0))

head(count_words, 20)
## # A tibble: 20 × 3
##    word       price     n
##    <chr>      <dbl> <int>
##  1 _7          17.8     1
##  2 _7         330       0
##  3 _7         480       1
##  4 _7         635       0
##  5 _7         783       0
##  6 _7         966       0
##  7 _7        1250       0
##  8 _7        1580       1
##  9 _7        2000       0
## 10 _7        3000       0
## 11 _arrienda   17.8     0
## 12 _arrienda  330       0
## 13 _arrienda  480       0
## 14 _arrienda  635       0
## 15 _arrienda  783       0
## 16 _arrienda  966       0
## 17 _arrienda 1250       0
## 18 _arrienda 1580       0
## 19 _arrienda 2000       0
## 20 _arrienda 3000       1
## Relative frequency of each word within each price level (word count over
## the total token count at that price), restricted to the top words.
word_freqs <- count_words %>%
  group_by(price) %>%
  mutate(price_sum = sum(n)) %>%
  mutate(proportion = n / price_sum) %>%
  ungroup() %>%
  filter(word %in% top_words)

word_freqs
## # A tibble: 1,010 × 5
##    word   price     n price_sum proportion
##    <chr>  <dbl> <int>     <int>      <dbl>
##  1 24      17.8   267     55294    0.00483
##  2 24     330     299     60276    0.00496
##  3 24     480     201     57873    0.00347
##  4 24     635     191     61729    0.00309
##  5 24     783     160     59439    0.00269
##  6 24     966     196     66184    0.00296
##  7 24    1250     136     62379    0.00218
##  8 24    1580     119     65235    0.00182
##  9 24    2000     133     68475    0.00194
## 10 24    3000      93     69611    0.00134
## # … with 1,000 more rows
## Identify words whose usage rises or falls with price: per word, fit a
## binomial GLM of word frequency on price and keep the slope on price.
## FIX: glm() with a two-column response expects cbind(successes, failures);
## the original passed the total token count (price_sum) as the second
## column, which inflates the number of trials. Failures = price_sum - n.
## (Since n << price_sum the estimates barely move, but the specification
## is now correct.)
word_model <- word_freqs %>%
              nest(data = c(price, n, price_sum, proportion)) %>%
              mutate(model = map(.x = data,
                                 .f = ~ glm(cbind(n, price_sum - n) ~ price,
                                            data = ., family = "binomial")),
                     model = map(model, tidy)) %>% ## broom::tidy -> one row per coefficient
              unnest(model) %>%
              filter(term == "price") %>%          ## keep only the slope on price
              mutate(p.value = p.adjust(p.value)) %>% ## correct for multiple testing
              arrange(-estimate)

word_model
## # A tibble: 101 × 7
##    word         data              term  estimate std.error statistic  p.value
##    <chr>        <list>            <chr>    <dbl>     <dbl>     <dbl>    <dbl>
##  1 cada         <tibble [10 × 4]> price 0.000422 0.0000237     17.8  4.13e-69
##  2 cuarto       <tibble [10 × 4]> price 0.000357 0.0000169     21.1  7.37e-97
##  3 espectacular <tibble [10 × 4]> price 0.000342 0.0000228     15.0  5.74e-49
##  4 servicio     <tibble [10 × 4]> price 0.000338 0.0000171     19.8  5.58e-85
##  5 exclusivo    <tibble [10 × 4]> price 0.000331 0.0000292     11.3  8.27e-28
##  6 mas          <tibble [10 × 4]> price 0.000295 0.0000243     12.2  3.91e-32
##  7 planta       <tibble [10 × 4]> price 0.000267 0.0000267     10.0  9.39e-22
##  8 oficinas     <tibble [10 × 4]> price 0.000249 0.0000288      8.64 3.59e-16
##  9 acabados     <tibble [10 × 4]> price 0.000243 0.0000248      9.79 9.26e-21
## 10 gimnasio     <tibble [10 × 4]> price 0.000242 0.0000207     11.7  1.15e-29
## # … with 91 more rows
## Volcano-style plot: slope on price vs adjusted p-value (log scale)
word_model %>%
  ggplot(aes(estimate, p.value)) +
  geom_vline(xintercept = 0, lty = 2, alpha = 0.7, color = "gray50") +
  geom_point(color = "midnightblue", alpha = 0.8, size = 2.5) +
  scale_y_log10() +
  geom_text_repel(aes(label = word))
## Warning: ggrepel: 78 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Significant words whose frequency increases the most with price
higher_words <- word_model %>%
  filter(p.value < 0.05) %>%
  slice_max(estimate, n = 12) %>%
  pull(word)

## Significant words whose frequency decreases the most with price
lower_words <- word_model %>%
  filter(p.value < 0.05) %>%
  slice_min(estimate, n = 12) %>%
  pull(word)

## Relative frequency across price levels for the strongest "cheap" words
word_freqs %>%
  filter(word %in% lower_words) %>%
  ggplot(aes(price, proportion, color = word)) +
  geom_line(size = 2.5, alpha = 0.7, show.legend = FALSE) +
  facet_wrap(vars(word), scales = "free_y") +
  theme_light()

## Relative frequency across price levels for the strongest "expensive" words
word_freqs %>%
  filter(word %in% higher_words) %>%
  ggplot(aes(price, proportion, color = word)) +
  geom_line(size = 2.5, alpha = 0.7, show.legend = FALSE) +
  facet_wrap(vars(word), scales = "free_y") +
  theme_light()

[2.] Modelar precio de viviendas

## Train/test split, stratified by the outcome (treated as a factor because
## price has exactly 10 discrete levels -> multiclass classification)
set.seed(123)
db_split <- db %>%
  mutate(price = as.factor(price)) %>%
  initial_split(strata = price)
db_train <- training(db_split)
db_test <- testing(db_split)

## 5-fold cross-validation on the training set, stratified by price
set.seed(234)
db_folds <- vfold_cv(data = db_train, v = 5, strata = price)
db_folds
## #  5-fold cross-validation using stratification 
## # A tibble: 5 × 2
##   splits              id   
##   <list>              <chr>
## 1 <split [9182/2298]> Fold1
## 2 <split [9183/2297]> Fold2
## 3 <split [9184/2296]> Fold3
## 4 <split [9184/2296]> Fold4
## 5 <split [9187/2293]> Fold5
## Metrics to track during evaluation
db_metrics <- metric_set(accuracy, roc_auc, mn_log_loss)

## Collapse each word vector into a single regex alternation ("w1|w2|...")
## for the step_regex() dummies in the recipe below
higher_pattern <- glue_collapse(higher_words, sep = "|")
lower_pattern <- glue_collapse(lower_words, sep = "|")

## Preprocessing recipe: the steps applied to the data before modeling.
## Reference: https://cran.r-project.org/web/packages/recipes/recipes.pdf
## FIX: removed the stray browseURL(...) call — opening a web browser is an
## interactive-only side effect that fails or hangs when this script runs
## non-interactively (Rscript, knitr, CI).
db_recipe <- recipe(formula = price ~ . , data = db_train) %>%
             update_role(id, new_role = "id") %>% ## id is an identifier, not a predictor
             ## dummies: does the description match any high/low-price word?
             step_regex(description, pattern = higher_pattern , result = "high_price_words") %>%
             step_regex(description, pattern = lower_pattern , result = "low_price_words") %>%
             step_rm(description) %>% ## raw text is no longer needed
             step_novel(property_type) %>% ## absorb factor levels unseen at training time
             #step_unknown(homeType) %>%
             #step_other(homeType, threshold = 0.02) %>%
             step_dummy(all_nominal_predictors()) %>%
             step_nzv(all_predictors()) ## drop near-zero-variance predictors
db_recipe
## Recipe
## 
## Inputs:
## 
##       role #variables
##         id          1
##    outcome          1
##  predictor         12
## 
## Operations:
## 
## Regular expression dummy variable using "cada|cuarto|espectacular|servicio|exclusivo|ma...
## Regular expression dummy variable using "chapinero|parqueadero|horas|calle|24|vigilanci...
## Variables removed description
## Novel factor level assignment for property_type
## Dummy variables from all_nominal_predictors()
## Sparse, unbalanced variable filter on all_predictors()
## Boosted-tree model specification (xgboost engine, classification).
## The number of trees is fixed at 1000; everything else is tuned.
xgb_spec <- boost_tree(mtry = tune(),
                       trees = 1000,
                       min_n = tune(),
                       tree_depth = tune(),
                       learn_rate = tune(),
                       sample_size = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_spec
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost
## Bundle the recipe and the model specification into one workflow
xgb_word_wf <- workflow(preprocessor = db_recipe, spec = xgb_spec)

## Space-filling (max-entropy) grid of 20 hyperparameter candidates
xgb_grid <- grid_max_entropy(tree_depth(c(5L, 10L)),
                             min_n(c(10L, 40L)),
                             mtry(c(5L, 10L)),
                             sample_prop(c(0.5, 1.0)),
                             learn_rate(c(-2, -1)), ## log10 scale: 0.01 .. 0.1
                             size = 20)
xgb_grid
## # A tibble: 20 × 5
##    tree_depth min_n  mtry sample_size learn_rate
##         <int> <int> <int>       <dbl>      <dbl>
##  1          6    28     6       0.891     0.0466
##  2          6    14     6       0.952     0.0126
##  3          6    27     9       0.932     0.0660
##  4          5    36    10       0.501     0.0868
##  5         10    36     8       0.587     0.0228
##  6          9    37     9       0.856     0.0371
##  7          7    32    10       0.581     0.0420
##  8          8    26     5       0.998     0.0600
##  9         10    15     8       0.902     0.0887
## 10          7    27     6       0.690     0.0613
## 11          6    39     9       0.783     0.0306
## 12          9    20     5       0.991     0.0142
## 13          8    14    10       0.946     0.0115
## 14          5    17     5       0.778     0.0234
## 15          6    30     5       0.628     0.0108
## 16          8    19    10       0.669     0.0228
## 17          6    13     7       0.771     0.0582
## 18          5    18     7       0.999     0.0802
## 19          9    17     6       0.587     0.0195
## 20          7    26     8       0.687     0.0205
## Tune via ANOVA racing: candidates that are statistically worse are
## eliminated early instead of being evaluated on every fold
xgb_word_rs <- tune_race_anova(object = xgb_word_wf,
                               resamples = db_folds,
                               grid = xgb_grid,
                               metrics = metric_set(mn_log_loss),
                               control = control_race(verbose_elim = TRUE))
xgb_word_rs
## # Tuning results
## # 5-fold cross-validation using stratification 
## # A tibble: 5 × 5
##   splits              id    .order .metrics          .notes           
##   <list>              <chr>  <int> <list>            <list>           
## 1 <split [9182/2298]> Fold1      3 <tibble [20 × 9]> <tibble [20 × 3]>
## 2 <split [9183/2297]> Fold2      1 <tibble [20 × 9]> <tibble [20 × 3]>
## 3 <split [9184/2296]> Fold3      2 <tibble [20 × 9]> <tibble [20 × 3]>
## 4 <split [9184/2296]> Fold4      5 <tibble [1 × 9]>  <tibble [1 × 3]> 
## 5 <split [9187/2293]> Fold5      4 <tibble [1 × 9]>  <tibble [0 × 3]> 
## 
## There were issues with some computations:
## 
##   - Warning(s) x20: There are new levels in a factor: La SalleThere are new levels in...   - Warning(s) x20: There are new levels in a factor: La SalleThere are new levels in...   - Warning(s) x1: There are new levels in a factor: La SalleThere are new levels in...   - Warning(s) x20: There are new levels in a factor: La SalleThere are new levels in...
## 
## Run `show_notes(.Last.tune.result)` for more information.

[3.] Desempeño del modelo

## Visualize the race: how candidates were eliminated across resamples
xgb_word_rs %>% plot_race()

## Best hyperparameter configurations for the tuned metric
xgb_word_rs %>% show_best()
## # A tibble: 1 × 11
##    mtry min_n tree_depth learn_rate sample_size .metric   .estimator  mean     n
##   <int> <int>      <int>      <dbl>       <dbl> <chr>     <chr>      <dbl> <int>
## 1     8    15         10     0.0887       0.902 mn_log_l… multiclass  1.20     5
## # … with 2 more variables: std_err <dbl>, .config <chr>
## Finalize the workflow with the winning hyperparameters, then fit once on
## the full training set and evaluate on the held-out test set
xgb_last <- xgb_word_wf %>%
  finalize_workflow(select_best(xgb_word_rs, metric = "mn_log_loss")) %>%
  last_fit(db_split)
## Warning: package 'xgboost' was built under R version 4.1.2
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Modelo Norte
xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits               id              .metrics .notes   .predictions .workflow 
##   <list>               <chr>           <list>   <list>   <list>       <list>    
## 1 <split [11480/3830]> train/test spl… <tibble> <tibble> <tibble>     <workflow>
## 
## There were issues with some computations:
## 
##   - Warning(s) x1: There are new levels in a factor: Modelo Norte
## 
## Run `show_notes(.Last.tune.result)` for more information.
## Multiclass log loss on the test-set predictions
collect_predictions(xgb_last) %>%
  mn_log_loss(price, `.pred_17.8`:`.pred_3000`)
## # A tibble: 1 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 mn_log_loss multiclass      1.11
## Confusion matrix: predicted class vs true price level
predictions <- collect_predictions(xgb_last) %>%
  conf_mat(price, .pred_class)

predictions
##           Truth
## Prediction 17.8 330 480 635 783 966 1250 1580 2000 3000
##       17.8  289  42  11   7   4   8    4    5    1    9
##       330    53 271  57  14   9   7    2    2    3    2
##       480    10  37 193  46  17   6    7    3    3    1
##       635     7  24  64 215  44  23   13    6    4    5
##       783     6   6  22  49 218  37   15   13    2    2
##       966     4   2  12  23  39 224   51   28    7    6
##       1250    2   2   0   7  17  38  210   33   32    7
##       1580    5   6   3   3   5  14   38  251   37   20
##       2000    6   3   4   8   9  20   28   41  242   45
##       3000   10   5   7   5   3   7    7   22   44  290
## Heatmap view of the confusion matrix
predictions %>%
  autoplot() +
  theme_light()

## One-vs-all ROC curve for each price level
collect_predictions(xgb_last) %>%
  roc_curve(price, `.pred_17.8`:`.pred_3000`) %>%
  ggplot(aes(1 - specificity, sensitivity, color = .level)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(alpha = 0.8, size = 1.2) +
  coord_equal() +
  theme_light()

## Variable importance of the final fitted xgboost model (top 15 features)
xgb_last %>%
  extract_workflow() %>%
  extract_fit_parsnip() %>%
  vip(geom = "point", num_features = 15) +
  theme_light()